# Start Python Imports
import math, time, random, datetime
# Data Manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize, StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report,roc_curve, auc
# Tensor Flow and Keras
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# ---- Load the data set ----
data_raw = pd.read_csv("bank.csv")
data_raw.head(10)
# Work on a copy so the raw frame stays untouched during preprocessing.
data = data_raw.copy()
# Quick structural overview of the frame.
data.shape
data.info()
# Count missing values per column.
data.isnull().sum()
# Transposed summary statistics plus a peek at the first rows.
data.describe().T
data.head(10)
# ---- Partition columns into categorical, numerical, discrete and target ----
# Object-dtype columns are treated as categorical.
categorical_features = [col for col in data.columns if data[col].dtypes == "O"]
print(len(categorical_features), "no of categorical variables")
# Continuous numeric columns.
numerical_features = ["CreditScore", "Age", "Balance", "EstimatedSalary"]
print(len(numerical_features), "no of numerical variables")
# Integer-coded columns with a small number of levels.
discrete_features = ["Tenure", "NumOfProducts", "HasCrCard", "IsActiveMember"]
print(len(discrete_features), "no of discrete variables")
# Binary churn label.
target = ["Exited"]
print("target has", len(target), "variable")
# Report the distinct values of every categorical and discrete feature.
# FIX: the original loop bodies were not indented, which is a SyntaxError.
for feature in categorical_features:
    print(feature, "has", len(data[feature].value_counts()), "unique values",
          "\n", data[feature].value_counts())
for feature in discrete_features:
    print(feature, "has", len(data[feature].value_counts()), "unique values",
          "\n", data[feature].value_counts())
# ---- EDA ----
# First check the distribution of the output classes; churn labels are
# typically imbalanced, so we want to see how skewed "Exited" is.
data_EDA = pd.DataFrame(data)
sns.countplot(x="Exited", data=data_EDA)
plt.title("Distribution of output classes")
# Univariate analysis: histogram + KDE + rug for each continuous variable.
# FIX: the original loop body was not indented (SyntaxError), and
# sns.distplot is deprecated/removed in modern seaborn — replaced with
# histplot(kde=True) plus an explicit rugplot.
for features in numerical_features:
    data_EDA = data.copy()
    sns.histplot(data_EDA[features], kde=True)
    sns.rugplot(data_EDA[features])
    plt.xlabel(features)
    plt.xticks(rotation=90)
    plt.title("Distribution of numerical variables")
    plt.show()
# Age is right-skewed: most customers fall in the 25-50 age group.
# A lot of customers have a zero account balance.
# Relationship between each numerical variable and the target.
# FIX: the loop body was not indented (SyntaxError), and `palette=colors1`
# referenced a name that is never defined anywhere in the script
# (NameError) — the palette argument is dropped.
for feature in numerical_features:
    data_EDA = data.copy()
    sns.barplot(x="Exited", y=feature, data=data_EDA, order=[1, 0])
    plt.xlabel('Exited')
    plt.xticks(rotation=90)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()
# The average age of exited customers is ~45 years.
# ~650 is the average credit score of exited customers.
# ~86000 is the average balance and ~100000 the average salary of exited customers.
# Percentage of customer churn is ~20.4%.
# FIX: the top-level pd.value_counts() helper is deprecated (removed in
# pandas 2.0) — use the Series method instead.
value_counts = data_EDA['Exited'].value_counts()
plt.figure(figsize=(6, 6))
value_counts.plot(kind='pie', explode=[0, 0.1], autopct='%1.1f%%', shadow=True)
plt.title('Proportion of customer churned and retained')
plt.show()
value_counts
# ---- Multivariate analysis ----
# Relationship between each categorical/discrete feature and the target.
# FIX: the original loop body was not indented, which is a SyntaxError.
for feature in categorical_features + discrete_features:
    data_EDA = data.copy()
    sns.countplot(x='Exited', hue=feature, data=data_EDA, order=[1, 0])
    plt.xlabel('Exited')
    plt.xticks(rotation=90)
    plt.ylabel('Count')
    plt.title(feature)
    plt.show()
# German and French customers are more likely to exit.
# Women have a higher exit rate.
# Customers who hold a credit card are more likely to leave.
# Inactive customers are leaving.
# One-year-old customers are more likely to leave.
# Correlation plots: pairwise scatter matrix colored by churn, then a heatmap.
sns.pairplot(data_EDA, kind="scatter", hue="Exited",
             plot_kws=dict(s=120, edgecolor="white", linewidth=2.5))
plt.show()
plt.figure(figsize=(15, 15))
# FIX: pandas >= 2.0 raises on non-numeric columns unless numeric_only=True.
sns.heatmap(data_EDA.corr(numeric_only=True), annot=True, cmap='RdYlGn')
# ---- Data preparation: build feature matrix X and target vector y ----
# The first three columns (presumably RowNumber, CustomerId, Surname — no
# predictive signal) are dropped; the last column is the target "Exited".
X = data_EDA.iloc[:, 3:-1].values
y = data_EDA.iloc[:, -1].values
X
y
# Label-encode the binary Gender column (index 2 of X).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 2] = le.fit_transform(X[:, 2])
print(X)
# One-hot encode Geography (index 1 of X); all other columns pass through.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))
X
y
# Hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Standardize features; the scaler is fit on the training split only so no
# test-set statistics leak into training.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# ---- Modelling: baseline ANN ----
import tensorflow as tf

# Sequential network: two ReLU hidden layers of 6 units each and a single
# sigmoid output unit for binary churn prediction.
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Adam optimizer with binary cross-entropy loss, tracking accuracy.
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train: batch size 10, 50 epochs, with 33% of the training data held out
# for validation.
model_history = ann.fit(X_train, y_train, validation_split=0.33,
                        batch_size=10, epochs=50)

# Learning curves: training vs validation accuracy per epoch.
hist = model_history.history
plt.plot(hist['accuracy'])
plt.plot(hist['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# Learning curves: training vs validation loss per epoch.
plt.plot(hist['loss'])
plt.plot(hist['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

# Predictions: sigmoid outputs thresholded at 0.5.
y_pred = ann.predict(X_test)
y_pred = (y_pred > 0.5)

# Evaluation: confusion matrix, accuracy and per-class report.
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
cr = metrics.classification_report(y_test, y_pred)
print(cr)
# ---- Hyper-parameter search ----
# Running many batch sizes / epochs locally was too slow; a previous run
# settled on {'batch_size': 10, 'epochs': 150, 'optimizers': 'Adam'}.
# Here only the learning rate is grid-searched with 5-fold CV
# (n_jobs=-3 leaves two cores free).
from sklearn.model_selection import GridSearchCV
# FIX: KerasClassifier was never imported anywhere (NameError below).
# NOTE(review): on keras >= 2.12 this wrapper lives in the external
# `scikeras` package instead — confirm the installed keras version.
from keras.wrappers.scikit_learn import KerasClassifier


def build_classifier(lrs):
    """Build a small ANN compiled with Adam at learning rate `lrs`."""
    # FIX: the function body was not indented (SyntaxError), and the
    # deprecated Adam(lr=...) argument is now learning_rate=.
    classifier = Sequential()
    classifier.add(Dense(6, activation="relu"))
    classifier.add(Dense(1, activation="sigmoid"))
    classifier.compile(optimizer=keras.optimizers.Adam(learning_rate=lrs),
                       loss="binary_crossentropy", metrics=["accuracy"])
    return classifier


classifier = KerasClassifier(build_fn=build_classifier, batch_size=10, epochs=150)
parameters = {'lrs': [0.001, 0.003]}
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=5, n_jobs=-3)
grid_search = grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_
# FIX: display() only exists inside IPython; print() works everywhere.
print(best_parameters)
print(best_accuracy)
# ---- Model 2: retrain with the tuned hyper-parameters ----
# NOTE(review): the grid search above tuned an *Adam* learning rate, but
# this model is compiled with SGD. SGD is kept here to preserve the
# reported results, but the optimizer choice should be revisited.
ann_model2 = tf.keras.models.Sequential()
# One ReLU hidden layer and a sigmoid output unit.
ann_model2.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann_model2.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# FIX: SGD(lr=...) is deprecated/removed in recent Keras; use learning_rate=.
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)
ann_model2.compile(optimizer=sgd_optimizer, loss='binary_crossentropy',
                   metrics=['accuracy'])

# Train with the tuned batch size (10) and epoch count (150).
model2_history = ann_model2.fit(X_train, y_train, validation_split=0.33,
                                batch_size=10, epochs=150)

# Learning curves: training vs validation accuracy per epoch.
plt.plot(model2_history.history['accuracy'])
plt.plot(model2_history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

# Learning curves: training vs validation loss per epoch.
plt.plot(model2_history.history['loss'])
plt.plot(model2_history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

# Threshold the sigmoid outputs at 0.5 and evaluate.
y_pred = ann_model2.predict(X_test)
y_pred = (y_pred > 0.5)

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
cr = metrics.classification_report(y_test, y_pred)
print(cr)
# With the tuned hyper-parameters the model accuracy improved from 0.852 to 0.862.
# German and French customers are more likely to exit.
# Women have a higher exit rate.
# Customers who hold a credit card are more likely to leave.
# Inactive customers are leaving.
# One-year-old customers are more likely to leave.
# The average age of exited customers is 45 years.
# 650 is the average credit score of exited customers.
# 86000 is the average balance and 100000 the average salary of exited customers.
# A lot of customers have a zero account balance.